In [129]:
"""Workbook to analyse encode predictions.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, too-many-branches, duplicate-code
Out[129]:
'Workbook to analyse encode predictions.\n'

SETUP¶

In [130]:
# Auto-reload imported project modules when their source files change.
%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [131]:
from __future__ import annotations

import copy
import functools
import gc
from pathlib import Path
from typing import Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.io as pio
pio.renderers.default='notebook'
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
from sklearn.metrics import confusion_matrix, f1_score

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.classification_merging_utils import merge_dataframes
from epi_ml.utils.notebooks.paper.metrics_per_assay import MetricsPerAssay
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_ORDER,
    CELL_TYPE,
    EPIATLAS_16_CT,
    LIFE_STAGE,
    SEX,
    IHECColorMap,
    MetadataHandler,
    SplitResultsHandler,
    set_file_id,
)
In [132]:
# Metadata column names for the cancer-status and biomaterial-type tasks.
CANCER = "harmonized_sample_cancer_high"
BIOMAT = "harmonized_biomaterial_type"
# First 7 entries of ASSAY_ORDER: the "core" assays.
CORE_ASSAYS = ASSAY_ORDER[0:7]
In [133]:
# Root directories for the paper data, figures and tables.
base_dir = Path.home() / "Projects" / "epiclass" / "output" / "paper"
paper_dir = base_dir
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
table_dir = base_dir / "tables"

# Fail fast if the figures directory is missing.
if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")
In [134]:
# Helper objects: metadata access and split-results aggregation utilities.
metadata_handler = MetadataHandler(paper_dir)
split_results_handler = SplitResultsHandler()
In [135]:
# Instantiate the IHEC color map. Use a lowercase alias instead of rebinding
# the class name (the original assignment shadowed the IHECColorMap class,
# making it impossible to create another instance later in the notebook).
ihec_color_map = IHECColorMap(base_fig_dir)
IHECColorMap = ihec_color_map  # keep old name bound for any downstream cells
assay_colors = ihec_color_map.assay_color_map
In [136]:
# ENCODE metadata files (created in encode_metadata_creation.ipynb).
encode_metadata_dir = base_data_dir / "metadata" / "encode"
In [137]:
# Model predictions computed on ENCODE files.
encode_predictions_dir = base_data_dir / "training_results" / "predictions" / "encode"
In [138]:
# Verify that both required input directories exist before going further.
for required_dir in (encode_metadata_dir, encode_predictions_dir):
    if not required_dir.exists():
        raise FileNotFoundError(f"Directory {required_dir} does not exist.")

Get complete metadata¶

Was created in encode_metadata_creation.ipynb using new web API downloads.

In [139]:
# Full ENCODE metadata (2025-02 snapshot, revoked files excluded), built in
# encode_metadata_creation.ipynb from new web API downloads.
full_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_full_metadata_2025-02_no_revoked.csv"
)
# low_memory=False: parse the whole file at once for consistent column dtypes.
complete_metadata_df = pd.read_csv(full_metadata_path, low_memory=False)

5 categories value counts

In [140]:
# Show the label distribution for each metadata category of interest;
# report categories that are absent from the table.
categories_of_interest = [
    ASSAY,
    CELL_TYPE,
    "cell_type",
    "epiclass_sample_ontology",
    "donor_life_stage",
    "donor_sex",
    "cancer_status",
    BIOMAT,
]
for cat in categories_of_interest:
    if cat in complete_metadata_df.columns:
        print(complete_metadata_df[cat].value_counts(dropna=False), "\n")
    else:
        print(f"Column {cat} not found.")
assay_epiclass
non-core    3593
input       1981
rna_seq     1188
h3k4me3      705
h3k27ac      623
mrna_seq     604
h3k27me3     585
h3k4me1      556
h3k36me3     554
h3k9me3      536
ctcf         468
wgbs         252
Name: count, dtype: int64

harmonized_sample_ontology_intermediate
NaN                                  2559
epithelial cell derived cell line    1886
lymphoma or leukaemia cell line      1354
cancer cell line                      797
mesoderm-derived structure            764
T cell                                480
stem cell derived cell line           399
digestive system                      339
B cell derived cell line              298
endoderm-derived structure            255
colon                                 174
brain                                 159
connective tissue cell                152
muscle organ                          148
neural cell                           138
pancreas                              116
fibroblast derived cell line          114
mucosa                                113
lymphocyte of B lineage               107
epithelium                             98
liver                                  85
keratinocyte                           83
monocyte                               81
embryonic cell (metazoa)               73
meso-epithelial cell                   67
muscle cell                            62
placenta                               58
nervous system                         55
hematopoietic cell                     49
lung                                   48
natural killer cell                    46
ovary                                  44
mammary gland epithelial cell          43
suprapubic skin                        42
muscle precursor cell                  39
peripheral blood mononuclear cell      34
uterus                                 32
extraembryonic cell                    31
kidney                                 30
testis                                 29
neural progenitor cell                 28
melanocyte                             24
hepatocyte                             22
trophoblast                            20
KMS-11                                 16
neutrophil                             13
secretory cell                         10
stem cell                              10
kidney epithelial cell                  7
endo-epithelial cell                    7
erythroid lineage cell                  5
mole                                    2
Name: count, dtype: int64

Column cell_type not found.
epiclass_sample_ontology
NaN                                  2559
epithelial cell derived cell line    1886
lymphoma or leukaemia cell line      1354
cancer cell line                      797
mesoderm-derived structure            764
T cell                                480
stem cell derived cell line           399
digestive system                      339
B cell derived cell line              298
endoderm-derived structure            255
colon                                 174
brain                                 159
connective tissue cell                152
muscle organ                          148
neural cell                           138
pancreas                              116
fibroblast derived cell line          114
mucosa                                113
lymphocyte of B lineage               107
epithelium                             98
liver                                  85
keratinocyte                           83
monocyte                               81
embryonic cell (metazoa)               73
meso-epithelial cell                   67
muscle cell                            62
placenta                               58
nervous system                         55
hematopoietic cell                     49
lung                                   48
natural killer cell                    46
ovary                                  44
mammary gland epithelial cell          43
suprapubic skin                        42
muscle precursor cell                  39
peripheral blood mononuclear cell      34
uterus                                 32
extraembryonic cell                    31
kidney                                 30
testis                                 29
neural progenitor cell                 28
melanocyte                             24
hepatocyte                             22
trophoblast                            20
KMS-11                                 16
neutrophil                             13
secretory cell                         10
stem cell                              10
kidney epithelial cell                  7
endo-epithelial cell                    7
erythroid lineage cell                  5
mole                                    2
Name: count, dtype: int64

Column donor_life_stage not found.
Column donor_sex not found.
Column cancer_status not found.
harmonized_biomaterial_type
cell line         5537
primary tissue    3611
primary cell      1856
other              600
NaN                 41
Name: count, dtype: int64

In [141]:
# cancer_status is absent from the new metadata: recover it from the older
# (2023-10) cleaned ENCODE metadata by joining on the file accession.
if "cancer_status" not in complete_metadata_df.columns:
    chip_path = (
        encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25_clean-v2.csv"
    )
    chip_metadata_df = pd.read_csv(chip_path, low_memory=False)

    # NOTE(review): the old table's "md5sum" column is joined against
    # FILE_accession — presumably it holds accessions here; verify.
    cancer_status = chip_metadata_df[["md5sum", "cancer_status"]]
    complete_metadata_df = complete_metadata_df.merge(
        cancer_status, how="left", left_on="FILE_accession", right_on="md5sum"
    )
    complete_metadata_df.drop(columns="md5sum", inplace=True)
    del chip_metadata_df

    # Reclaim memory from the large temporary dataframe.
    gc.collect()

Merge all available predictions¶

In [142]:
# Load the ENCODE ChIP-seq prediction table for each classification task
# (one "*_1l_3000n" folder per trained model).
chip_pred_dfs = {}
nb_chip_files = 9619  # expected number of ENCODE ChIP-seq files
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # folder name: [category]_1l_3000n
    pred_files = list(folder.rglob("complete_no_valid_oversample*chip.csv"))
    if not pred_files:
        # Explicit error instead of an opaque IndexError on an empty glob.
        raise FileNotFoundError(f"No chip prediction file found in {folder}")
    encode_df = pd.read_csv(pred_files[0])
    encode_df = set_file_id(encode_df)
    chip_pred_dfs[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_chip_files

if len(chip_pred_dfs) != 6:
    raise ValueError(f"{len(chip_pred_dfs)} != 6")
harmonized_biomaterial_type (9619, 7)
harmonized_donor_life_stage (9619, 8)
harmonized_sample_ontology_intermediate (9619, 19)
assay_epiclass (9619, 14)
harmonized_donor_sex (9619, 6)
harmonized_sample_cancer_high (9619, 5)
In [143]:
# Load the ENCODE RNA-seq prediction table for each classification task.
pred_dfs_rna = {}
nb_rna_files = 1790  # expected number of ENCODE RNA-seq files
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # folder name: [category]_1l_3000n
    pred_files = list(folder.rglob("complete_no_valid_oversample*rna.csv"))
    if not pred_files:
        # Explicit error instead of an opaque IndexError on an empty glob.
        raise FileNotFoundError(f"No rna prediction file found in {folder}")
    encode_df = pd.read_csv(pred_files[0])
    encode_df = set_file_id(encode_df)
    pred_dfs_rna[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_rna_files

if len(pred_dfs_rna) != 6:
    raise ValueError(f"{len(pred_dfs_rna)} != 6")
harmonized_biomaterial_type (1790, 7)
harmonized_donor_life_stage (1790, 8)
harmonized_sample_ontology_intermediate (1790, 19)
assay_epiclass (1790, 14)
harmonized_donor_sex (1790, 6)
harmonized_sample_cancer_high (1790, 5)
In [144]:
# Load the ENCODE WGBS prediction table for each classification task.
pred_dfs_wgb = {}
nb_wgb_files = 252  # expected number of ENCODE WGBS files
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # folder name: [category]_1l_3000n
    pred_files = list(folder.rglob("complete_no_valid_oversample*wgbs.csv"))
    if not pred_files:
        # Explicit error instead of an opaque IndexError on an empty glob.
        raise FileNotFoundError(f"No wgbs prediction file found in {folder}")
    encode_df = pd.read_csv(pred_files[0])
    encode_df = set_file_id(encode_df)
    pred_dfs_wgb[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_wgb_files

if len(pred_dfs_wgb) != 6:
    raise ValueError(f"{len(pred_dfs_wgb)} != 6")
harmonized_biomaterial_type (252, 7)
harmonized_donor_life_stage (252, 8)
harmonized_sample_ontology_intermediate (252, 19)
assay_epiclass (252, 14)
harmonized_donor_sex (252, 6)
harmonized_sample_cancer_high (252, 5)
In [145]:
# The assay task was trained with 11 classes: tag its dict key accordingly.
for pred_dfs in (chip_pred_dfs, pred_dfs_rna, pred_dfs_wgb):
    pred_dfs[f"{ASSAY}_11c"] = pred_dfs.pop(ASSAY)
In [146]:
# Stack ChIP-seq, RNA-seq and WGBS predictions into one table per task.
concat_pred_dfs = {
    category: pd.concat(
        [chip_pred_dfs[category], pred_dfs_rna[category], pred_dfs_wgb[category]]
    )
    for category in [f"{ASSAY}_11c", CELL_TYPE, SEX, LIFE_STAGE, CANCER, BIOMAT]
}
In [147]:
# The 7-class assay model predictions live in a dedicated "7c" subfolder;
# exactly one prediction file is expected.
assay_7c_candidates = list(
    (encode_predictions_dir / "assay_epiclass_1l_3000n" / "7c").glob(
        "**/complete_no_valid_oversample*.csv"
    )
)
assert len(assay_7c_candidates) == 1
assay_7c_df = set_file_id(pd.read_csv(assay_7c_candidates[0]))
In [148]:
# Add the 7-class assay predictions alongside the other tasks.
concat_pred_dfs[f"{ASSAY}_7c"] = assay_7c_df
In [149]:
# Number of leading shared columns ("True class", "Predicted class") after md5sum.
same_col_names = 2
# Make all different columns have unique relevant names except for the pred vector
for cat, df in concat_pred_dfs.items():
    try:
        df.drop(columns=["Same?"], inplace=True)
    except KeyError:
        pass

    # Max per-class score, computed over the score columns (everything after
    # md5sum + the two shared columns); inserted as column index 3.
    df.insert(3, "Max pred", value=df.iloc[:, 1 + same_col_names :].max(axis=1))

    if cat in [f"{ASSAY}_11c", f"{ASSAY}_7c"]:
        # Assay tasks: suffix every column (their class-score names collide).
        old_names = df.columns[1:]
    else:
        # Other tasks: only suffix the shared columns and "Max pred".
        old_names = df.columns[1 : 2 + same_col_names]

    new_names = [f"{old_name} ({cat})" for old_name in old_names]
    df.rename(columns=dict(zip(old_names, new_names)), inplace=True)

    df.set_index("md5sum", inplace=True)

    concat_pred_dfs[cat] = df
In [150]:
# Merge all task tables (each indexed on md5sum) into one wide dataframe.
df_order = [f"{ASSAY}_7c", f"{ASSAY}_11c", CELL_TYPE, SEX, LIFE_STAGE, CANCER, BIOMAT]
full_merged_df = functools.reduce(
    merge_dataframes, (concat_pred_dfs[cat] for cat in df_order)
)
In [151]:
# Attach full ENCODE metadata to the merged predictions (left join on accession).
preds_plus_metadata_df: pd.DataFrame = full_merged_df.merge(
    complete_metadata_df,
    left_index=True,
    right_on="FILE_accession",
    how="left",
    suffixes=("", "_delete"),
)

# Drop duplicated metadata columns introduced by the suffix, reporting each one.
duplicated_cols = [
    col for col in preds_plus_metadata_df.columns if col.endswith("_delete")
]
for col in duplicated_cols:
    print(col)
preds_plus_metadata_df = preds_plus_metadata_df.drop(columns=duplicated_cols)
In [152]:
assert isinstance(preds_plus_metadata_df, pd.DataFrame)  # pylance being weird

# Reorder columns: prediction columns first, metadata columns after.
meta_col_order = [
    col for col in complete_metadata_df.columns if col in preds_plus_metadata_df.columns
]
results_col_order = [
    col for col in full_merged_df.columns if col in preds_plus_metadata_df.columns
]

new_order = results_col_order + meta_col_order
preds_plus_metadata_df = preds_plus_metadata_df.loc[:, new_order]
In [153]:
# Create "True class (task)" columns from the corresponding metadata column,
# and overwrite the task column itself with the same ground-truth labels.
for target_cat, source_col in [
    (SEX, SEX),
    (LIFE_STAGE, LIFE_STAGE),
    (CANCER, "cancer_status"),
    (CELL_TYPE, "epiclass_sample_ontology"),
    (BIOMAT, BIOMAT),
]:
    true_col_name = f"True class ({target_cat})"
    print(true_col_name, source_col)
    preds_plus_metadata_df[true_col_name] = preds_plus_metadata_df[source_col]
    preds_plus_metadata_df[target_cat] = preds_plus_metadata_df[source_col]
True class (harmonized_donor_sex) harmonized_donor_sex
True class (harmonized_donor_life_stage) harmonized_donor_life_stage
True class (harmonized_sample_cancer_high) cancer_status
True class (harmonized_sample_ontology_intermediate) epiclass_sample_ontology
True class (harmonized_biomaterial_type) harmonized_biomaterial_type
In [154]:
# Both assay tasks share the same ground truth: the assay_epiclass label.
preds_plus_metadata_df[f"True class ({ASSAY}_7c)"] = preds_plus_metadata_df[ASSAY]
preds_plus_metadata_df[f"True class ({ASSAY}_11c)"] = preds_plus_metadata_df[ASSAY]

Removing revoked files (no available metadata)

In [155]:
# Drop files without metadata (revoked files have no "in_epiatlas" value).
preds_plus_metadata_df = preds_plus_metadata_df[
    preds_plus_metadata_df["in_epiatlas"].notna()
]
print(preds_plus_metadata_df.shape)
(11643, 276)
In [156]:
# Persist the full augmented prediction table as a compressed CSV.
logdir = base_data_dir / "training_results" / "predictions" / "encode"
filename = logdir / "complete_encode_predictions_augmented_2025-02_metadata.csv.gz"

# index=False drops the dataframe index; accessions remain available through
# the FILE_accession column — NOTE(review): confirm the index is redundant here.
preds_plus_metadata_df.to_csv(filename, index=False, compression="gzip")

Remove datasets overlapping with EpiATLAS

In [157]:
# Keep only files absent from the EpiATLAS training data.
# NOTE(review): astype(bool) assumes "in_epiatlas" holds booleans at this point
# (NaNs were removed above); if values were strings, any non-empty string would
# be truthy — verify the column dtype.
all_preds_no_epiatlas = preds_plus_metadata_df[
    ~preds_plus_metadata_df["in_epiatlas"].astype(bool)
]
print(all_preds_no_epiatlas.shape)
(8777, 276)
In [158]:
# Assay label distribution among non-EpiATLAS ENCODE files.
all_preds_no_epiatlas[ASSAY].value_counts(dropna=False)
Out[158]:
assay_epiclass
non-core    3593
input       1284
rna_seq      948
mrna_seq     604
ctcf         468
h3k4me3      379
h3k27ac      366
h3k27me3     294
h3k4me1      257
h3k36me3     245
h3k9me3      241
wgbs          98
Name: count, dtype: int64

Cell type metrics¶

In [159]:
# Restrict to files whose cell type is one of the 16 EpiATLAS cell-type classes.
cell_type_df = preds_plus_metadata_df.copy(deep=True)
cell_type_df = cell_type_df[cell_type_df[CELL_TYPE].isin(EPIATLAS_16_CT)]
print(cell_type_df.shape)
display(cell_type_df[CELL_TYPE].value_counts(dropna=False))
(1842, 276)
harmonized_sample_ontology_intermediate
mesoderm-derived structure       764
endoderm-derived structure       255
colon                            174
brain                            159
connective tissue cell           152
muscle organ                     148
monocyte                          81
mammary gland epithelial cell     43
extraembryonic cell               31
hepatocyte                        22
neutrophil                        13
Name: count, dtype: int64
In [160]:
# Here "core" also includes mRNA-seq (unlike the CORE_ASSAYS constant above).
core_assays = ASSAY_ORDER + ["mrna_seq"]
cell_type_core_df = cell_type_df[cell_type_df[ASSAY].isin(core_assays)]
cell_type_noncore_df = cell_type_df[~cell_type_df[ASSAY].isin(core_assays)]
In [161]:
# Show cell-type and assay distributions for the core / non-core subsets.
# (Removed unused local `N = df.shape[0]`, which was never read.)
for df in [cell_type_core_df, cell_type_noncore_df]:
    print(df.shape)
    display(df[CELL_TYPE].value_counts(dropna=False))
    display(df["assay"].value_counts(dropna=False))
(1580, 276)
harmonized_sample_ontology_intermediate
mesoderm-derived structure       685
endoderm-derived structure       215
brain                            149
colon                            141
muscle organ                     137
connective tissue cell           105
monocyte                          73
mammary gland epithelial cell     35
extraembryonic cell               15
hepatocyte                        13
neutrophil                        12
Name: count, dtype: int64
assay
input       281
rna_seq     192
h3k4me3     164
h3k4me1     153
h3k36me3    152
h3k9me3     146
h3k27ac     142
h3k27me3    138
mrna_seq    128
wgbs         84
Name: count, dtype: int64
(262, 276)
harmonized_sample_ontology_intermediate
mesoderm-derived structure       79
connective tissue cell           47
endoderm-derived structure       40
colon                            33
extraembryonic cell              16
muscle organ                     11
brain                            10
hepatocyte                        9
monocyte                          8
mammary gland epithelial cell     8
neutrophil                        1
Name: count, dtype: int64
assay
ctcf               116
polr2a              29
h3k9ac              26
polr2aphosphos5     16
h3k4me2              9
ep300                9
h2afz                8
h3k79me2             7
h4k20me1             6
ezh2                 4
h2bk12ac             3
h3k14ac              3
h3k18ac              3
h3k4ac               3
h3k23ac              3
h4k8ac               3
h3k79me1             2
h2bk120ac            2
h4k91ac              2
h2bk5ac              2
h2ak5ac              2
ezh2phosphot487      1
h2bk15ac             1
h3k9me2              1
h4k12ac              1
Name: count, dtype: int64
In [162]:
# Cell-type accuracy/F1 at increasing prediction-score thresholds,
# for the core and non-core assay subsets.
for df, name in zip([cell_type_core_df, cell_type_noncore_df], ["core", "noncore"]):
    print(name)
    full_N = df.shape[0]
    for min_pred in [0, 0.6, 0.8, 0.9]:
        # N is the number of files retained at this score threshold.
        acc, f1, N = MetricsPerAssay.compute_metrics(
            df=df, min_pred=min_pred, cat_label=CELL_TYPE
        )
        print(
            f"Min pred: {min_pred}, N: {N} ({N/full_N:.2%}), Acc: {acc:.3f}, F1: {f1:.3f}"
        )
    print()
core
Min pred: 0, N: 1580 (100.00%), Acc: 0.873, F1: 0.564
Min pred: 0.6, N: 1292 (81.77%), Acc: 0.966, F1: 0.836
Min pred: 0.8, N: 1155 (73.10%), Acc: 0.973, F1: 0.921
Min pred: 0.9, N: 998 (63.16%), Acc: 0.979, F1: 0.921

noncore
Min pred: 0, N: 262 (100.00%), Acc: 0.851, F1: 0.657
Min pred: 0.6, N: 173 (66.03%), Acc: 0.971, F1: 0.877
Min pred: 0.8, N: 130 (49.62%), Acc: 0.992, F1: 0.967
Min pred: 0.9, N: 107 (40.84%), Acc: 1.000, F1: 1.000

Other cell type trainings metrics¶

Includes cell type classifiers trained with single assays. (e.g. only h3k4me1 files)

In [163]:
# Predictions from cell-type classifiers trained on single assays / assay subsets.
pred_folder = (
    base_data_dir
    / f"training_results/dfreeze_v2/hg38_100kb_all_none/{CELL_TYPE}_1l_3000n/complete-no_valid-oversampling"
)
if not pred_folder.exists():
    raise FileNotFoundError(f"Directory {pred_folder} does not exist.")
In [164]:
# Collect one prediction table per training variant (e.g. "h3k4me1_only").
other_ct_dfs = {}
for folder in pred_folder.glob("*"):
    if not folder.is_dir():
        print(f"Skipping {folder}")
        continue

    csv_candidates = list(folder.glob("predictions/*.csv"))
    if len(csv_candidates) > 1:
        print(f"More than one prediction file found in {folder}")
        continue
    if len(csv_candidates) == 0:
        print(f"No prediction file found in {folder}")
        continue

    pred_df = pd.read_csv(csv_candidates[0])
    name = folder.name.replace("complete_no_valid_oversample_", "")

    # Normalize label case before comparing true vs predicted classes.
    for col in ["True class", "Predicted class"]:
        pred_df[col] = pred_df[col].str.lower()

    other_ct_dfs[name] = pred_df
In [165]:
# Available training variants (per-assay and multi-assay cell-type classifiers).
other_ct_dfs.keys()
Out[165]:
dict_keys(['h3k4me3_only', 'assay11-ct16', 'h3k9me3_only', 'h3k36me3_only', 'assay7-ct16', 'h3k27ac_only', 'input_only', 'h3k27me3_only', 'h3k4me1_only', 'assay7-ct19'])
In [166]:
def compute_cell_type_acc(
    metadata_df: pd.DataFrame,
    pred_dfs_dict: Dict[str, pd.DataFrame],
    min_pred: float = 0.6,
) -> None:
    """Compute and print accuracy/F1 of cell-type predictions for the 16
    EpiATLAS cell types, for each prediction table in `pred_dfs_dict`.

    An inner merge of the metadata and predictions is performed (on
    md5sum/FILE_accession), so only files present in both are scored.
    Metrics are printed twice per table: over all predictions, and over
    predictions whose "Max pred" score exceeds `min_pred`.
    """
    meta_df = metadata_df[metadata_df[CELL_TYPE].isin(EPIATLAS_16_CT)].copy()

    # print("Assay counts for 16 cell types")
    # values_count = meta_df["Assay"].value_counts(dropna=False)
    # display(values_count)
    # display_perc(values_count / values_count.sum() * 100)

    # print("Cell types distribution")
    # values_count = meta_df[CELL_TYPE].value_counts(dropna=False)
    # display(values_count)
    # display_perc(values_count / values_count.sum() * 100)

    # Sorted for a deterministic report order across runs.
    for name, pred_df in sorted(pred_dfs_dict.items()):
        print(name)
        pred_w_ct = pred_df.merge(
            meta_df, left_on="md5sum", right_on="FILE_accession", how="inner"
        )
        N = pred_w_ct.shape[0]

        # Calculate results for all predictions
        true, pred = pred_w_ct[CELL_TYPE], pred_w_ct["Predicted class"]

        total_correct = (true == pred).sum()
        acc = total_correct / N
        # Macro-F1 restricted to labels that were actually predicted.
        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")

        print(f"Acc (pred>0.0): {total_correct}/{N} ({acc:.2%})")
        print(f"F1 (pred>0.0): {f1:.2f}")

        # Calculate results for predictions with max_pred
        pred_w_ct_filtered = pred_w_ct[pred_w_ct["Max pred"] > min_pred]
        true, pred = pred_w_ct_filtered[CELL_TYPE], pred_w_ct_filtered["Predicted class"]

        total_correct_filtered = (true == pred).sum()
        perc_filtered = total_correct_filtered / pred_w_ct_filtered.shape[0]

        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")

        print(
            f"Acc (pred>{min_pred:.1f}): {total_correct_filtered}/{pred_w_ct_filtered.shape[0]} ({perc_filtered:.2%})"
        )
        # diff = number of samples discarded by the score threshold.
        diff = N - pred_w_ct_filtered.shape[0]
        print(f"F1 (pred>{min_pred}): {f1:.2f}")
        print(f"Samples ignored at {min_pred:.1f}: {diff} ({diff/N:.2%})\n")
In [167]:
# Score every training variant on non-core and core assay files separately.
mask_core_assays = complete_metadata_df[ASSAY].isin(core_assays)
non_core_metadata_df = complete_metadata_df[~mask_core_assays]
core_metadata_df = complete_metadata_df[mask_core_assays]

compute_cell_type_acc(non_core_metadata_df, other_ct_dfs)
print("\n")
compute_cell_type_acc(core_metadata_df, other_ct_dfs)
assay11-ct16
Acc (pred>0.0): 223/262 (85.11%)
F1 (pred>0.0): 0.66
Acc (pred>0.6): 168/173 (97.11%)
F1 (pred>0.6): 0.88
Samples ignored at 0.6: 89 (33.97%)

assay7-ct16
Acc (pred>0.0): 199/262 (75.95%)
F1 (pred>0.0): 0.56
Acc (pred>0.6): 173/185 (93.51%)
F1 (pred>0.6): 0.93
Samples ignored at 0.6: 77 (29.39%)

assay7-ct19
Acc (pred>0.0): 134/262 (51.15%)
F1 (pred>0.0): 0.49
Acc (pred>0.6): 105/176 (59.66%)
F1 (pred>0.6): 0.66
Samples ignored at 0.6: 86 (32.82%)

h3k27ac_only
Acc (pred>0.0): 169/262 (64.50%)
F1 (pred>0.0): 0.48
Acc (pred>0.6): 85/90 (94.44%)
F1 (pred>0.6): 0.93
Samples ignored at 0.6: 172 (65.65%)

h3k27me3_only
Acc (pred>0.0): 34/262 (12.98%)
F1 (pred>0.0): 0.04
Acc (pred>0.6): 17/143 (11.89%)
F1 (pred>0.6): 0.10
Samples ignored at 0.6: 119 (45.42%)

h3k36me3_only
Acc (pred>0.0): 170/262 (64.89%)
F1 (pred>0.0): 0.49
Acc (pred>0.6): 66/75 (88.00%)
F1 (pred>0.6): 0.87
Samples ignored at 0.6: 187 (71.37%)

h3k4me1_only
Acc (pred>0.0): 164/262 (62.60%)
F1 (pred>0.0): 0.47
Acc (pred>0.6): 80/87 (91.95%)
F1 (pred>0.6): 0.97
Samples ignored at 0.6: 175 (66.79%)

h3k4me3_only
Acc (pred>0.0): 165/262 (62.98%)
F1 (pred>0.0): 0.50
Acc (pred>0.6): 144/173 (83.24%)
F1 (pred>0.6): 0.68
Samples ignored at 0.6: 89 (33.97%)

h3k9me3_only
Acc (pred>0.0): 39/262 (14.89%)
F1 (pred>0.0): 0.05
Acc (pred>0.6): 16/142 (11.27%)
F1 (pred>0.6): 0.20
Samples ignored at 0.6: 120 (45.80%)

input_only
Acc (pred>0.0): 49/262 (18.70%)
F1 (pred>0.0): 0.11
Acc (pred>0.6): 42/205 (20.49%)
F1 (pred>0.6): 0.19
Samples ignored at 0.6: 57 (21.76%)



assay11-ct16
Acc (pred>0.0): 1078/1176 (91.67%)
F1 (pred>0.0): 0.74
Acc (pred>0.6): 993/1022 (97.16%)
F1 (pred>0.6): 0.92
Samples ignored at 0.6: 154 (13.10%)

assay7-ct16
Acc (pred>0.0): 1070/1176 (90.99%)
F1 (pred>0.0): 0.73
Acc (pred>0.6): 962/981 (98.06%)
F1 (pred>0.6): 0.93
Samples ignored at 0.6: 195 (16.58%)

assay7-ct19
Acc (pred>0.0): 1080/1176 (91.84%)
F1 (pred>0.0): 0.65
Acc (pred>0.6): 941/961 (97.92%)
F1 (pred>0.6): 0.85
Samples ignored at 0.6: 215 (18.28%)

h3k27ac_only
Acc (pred>0.0): 564/1176 (47.96%)
F1 (pred>0.0): 0.34
Acc (pred>0.6): 372/410 (90.73%)
F1 (pred>0.6): 0.73
Samples ignored at 0.6: 766 (65.14%)

h3k27me3_only
Acc (pred>0.0): 257/1176 (21.85%)
F1 (pred>0.0): 0.16
Acc (pred>0.6): 182/416 (43.75%)
F1 (pred>0.6): 0.51
Samples ignored at 0.6: 760 (64.63%)

h3k36me3_only
Acc (pred>0.0): 662/1176 (56.29%)
F1 (pred>0.0): 0.38
Acc (pred>0.6): 482/612 (78.76%)
F1 (pred>0.6): 0.70
Samples ignored at 0.6: 564 (47.96%)

h3k4me1_only
Acc (pred>0.0): 597/1176 (50.77%)
F1 (pred>0.0): 0.32
Acc (pred>0.6): 518/670 (77.31%)
F1 (pred>0.6): 0.65
Samples ignored at 0.6: 506 (43.03%)

h3k4me3_only
Acc (pred>0.0): 578/1176 (49.15%)
F1 (pred>0.0): 0.35
Acc (pred>0.6): 449/576 (77.95%)
F1 (pred>0.6): 0.59
Samples ignored at 0.6: 600 (51.02%)

h3k9me3_only
Acc (pred>0.0): 279/1176 (23.72%)
F1 (pred>0.0): 0.15
Acc (pred>0.6): 202/504 (40.08%)
F1 (pred>0.6): 0.44
Samples ignored at 0.6: 672 (57.14%)

input_only
Acc (pred>0.0): 349/1176 (29.68%)
F1 (pred>0.0): 0.20
Acc (pred>0.6): 274/721 (38.00%)
F1 (pred>0.6): 0.26
Samples ignored at 0.6: 455 (38.69%)

Confusion matrices¶

In [168]:
# Output directory for core-assay cell-type confusion matrices.
conf_matrix_logdir = (
    base_fig_dir / "encode_predictions" / "confusion_matrices" / CELL_TYPE / "core"
)
conf_matrix_logdir.mkdir(parents=True, exist_ok=True)
In [169]:
# Core-assay metadata restricted to the 16 EpiATLAS cell types.
meta_df = core_metadata_df[core_metadata_df[CELL_TYPE].isin(EPIATLAS_16_CT)].copy()

# Only keep the 16-cell-type classifiers (11-assay and 7-assay variants).
limited_pred_dfs_dict = {k: v for k, v in other_ct_dfs.items() if "-ct16" in k}
In [170]:
# Write confusion matrices per task and score threshold, with and without
# RNA-derived files.
for no_rna in [True, False]:
    for task_name, df in limited_pred_dfs_dict.items():
        pred_w_ct = df.merge(
            meta_df, left_on="md5sum", right_on="FILE_accession", how="inner"
        )

        if no_rna:
            pred_w_ct = pred_w_ct[~pred_w_ct[ASSAY].str.contains("rna")]

        for threshold in [0, 0.6, 0.8]:
            sub_df = pred_w_ct[pred_w_ct["Max pred"] >= threshold]

            true, pred = sub_df[CELL_TYPE], sub_df["Predicted class"]
            cm = confusion_matrix(true, pred, labels=EPIATLAS_16_CT)

            filename = f"{task_name}-core-confusion_matrix-{threshold*100}"
            if no_rna:
                # Bug fix: the name was the constant "(unknown)-no_rna", so every
                # no_rna matrix overwrote the previous one. Derive it from the
                # task/threshold filename instead.
                final_filename = f"{filename}-no_rna"
                this_logdir = conf_matrix_logdir / "no_rna"
            else:
                final_filename = filename
                this_logdir = conf_matrix_logdir / "with_rna"
            this_logdir.mkdir(parents=True, exist_ok=True)

            writer = ConfusionMatrixWriter(labels=EPIATLAS_16_CT, confusion_matrix=cm)
            writer.to_all_formats(
                logdir=this_logdir,
                name=final_filename,
            )
            # Close all figures to avoid memory growth across the loops.
            plt.close("all")
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

ASSAY metrics¶

No RNA-seq here

Download note

paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2/hg38_100kb_all_none/assay_epiclass_1l_3000n"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./*c/complete_no_valid_oversample .

paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./hg38_100kb_all_none_w_encode_noncore/assay_epiclass_1l_3000n/complete_no_valid_oversample-0 .

find -type f -name "*.list*.csv" -print0 | xargs -0 rename 's/\.list//g'
In [171]:
# Prediction folders for the 7-, 11- and 13-class assay models.
data_dir = base_data_dir / "training_results" / "dfreeze_v2"
assay7_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/7c/complete_no_valid_oversample"
)
assay11_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/11c/complete_no_valid_oversample"
)
# The 13-class model was trained with ENCODE non-core assays included.
assay13_folder = (
    data_dir
    / f"hg38_100kb_all_none_w_encode_noncore/{ASSAY}_1l_3000n/13c/complete_no_valid_oversample"
)
In [172]:
# Load ENCODE predictions for each assay model and attach assay metadata.
pred_dfs_dict = {}
for name, folder in zip(
    ["7c", "11c", "13c"], [assay7_folder, assay11_folder, assay13_folder]
):
    if not folder.exists():
        print(f"Folder {folder} does not exist.")
        continue

    pred_folder = folder / "predictions" / "encode"
    if not pred_folder.exists():
        print(f"Folder {pred_folder} does not exist.")
        continue

    csv_files = list(pred_folder.glob("*.csv"))
    if len(csv_files) != 1:
        print(f"Found {len(csv_files)} files in {pred_folder}.")
        continue

    pred_df = pd.read_csv(csv_files[0], sep=",")
    pred_df = pred_df.drop(columns=["Same?"], errors="ignore")

    # Add assay metadata
    pred_df = pred_df.merge(
        complete_metadata_df, left_on="md5sum", right_on="FILE_accession", how="left"
    )

    pred_df["True class"] = pred_df["assay_epiclass"]
    pred_dfs_dict[name] = pred_df

Core7 preds¶

In [173]:
# Destination for per-model mislabel tables.
# NOTE: the original cell also rebound `data_dir` here ("output_dir = data_dir
# = ..."), silently shadowing the dfreeze_v2 data_dir defined above; that
# accidental double assignment is removed.
output_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
for name, df in pred_dfs_dict.items():
    print(name)
    print(df.shape)

    # Only consider files already labeled with core7 assays
    df = df[df[ASSAY].isin(CORE_ASSAYS)]

    # Only consider non-EpiAtlas samples; string comparison tolerates the
    # column holding either booleans or "True"/"False" strings.
    df = df[df["in_epiatlas"].astype(str) == "False"]

    # Accuracy over all predictions.
    correct_pred = df["Predicted class"] == df["True class"]
    total_correct = correct_pred.sum()
    total = df.shape[0]
    perc = total_correct / total
    print(f"Acc (pred>=0.0) {total_correct}/{total} ({perc:.2%})")

    # Per-assay accuracy for confident predictions (score >= 0.6).
    for assay in CORE_ASSAYS:
        min_pred = 0.6
        df_assay = df[df[ASSAY] == assay]
        df_assay = df_assay[df_assay["Max pred"] >= min_pred]
        correct_pred = df_assay["Predicted class"] == df_assay["True class"]
        total_correct = correct_pred.sum()
        total = df_assay.shape[0]
        perc = total_correct / total
        print(
            f"Acc (pred>={min_pred:.1f}) {assay} = {total_correct}/{total} ({perc:.2%})"
        )

    # Overall accuracy for confident predictions (score >= 0.6).
    df_filtered = df[df["Max pred"] >= 0.6]
    correct_pred_filtered = df_filtered["Predicted class"] == df_filtered["True class"]
    total_correct_filtered = correct_pred_filtered.sum()
    total_filtered = df_filtered.shape[0]
    perc_filtered = total_correct_filtered / total_filtered
    print(
        f"Acc (pred>=0.6): {total_correct_filtered}/{total_filtered} ({perc_filtered:.2%})"
    )

    # Save confidently-mislabeled files for manual inspection.
    df_filtered_wrong = df_filtered[~correct_pred_filtered]
    df_filtered_wrong.to_csv(
        output_dir / f"encode_only_mislabels_minPred0.6_{name}.csv", index=False
    )
7c
(9619, 220)
Acc (pred>=0.0) 3013/3066 (98.27%)
Acc (pred>=0.6) h3k4me3 = 378/379 (99.74%)
Acc (pred>=0.6) h3k27ac = 366/366 (100.00%)
Acc (pred>=0.6) h3k4me1 = 254/256 (99.22%)
Acc (pred>=0.6) h3k36me3 = 232/242 (95.87%)
Acc (pred>=0.6) h3k27me3 = 285/291 (97.94%)
Acc (pred>=0.6) h3k9me3 = 241/241 (100.00%)
Acc (pred>=0.6) input = 1224/1236 (99.03%)
Acc (pred>=0.6): 2980/3011 (98.97%)
11c
(9619, 224)
Acc (pred>=0.0) 2961/3066 (96.58%)
Acc (pred>=0.6) h3k4me3 = 378/378 (100.00%)
Acc (pred>=0.6) h3k27ac = 365/365 (100.00%)
Acc (pred>=0.6) h3k4me1 = 254/255 (99.61%)
Acc (pred>=0.6) h3k36me3 = 230/242 (95.04%)
Acc (pred>=0.6) h3k27me3 = 285/291 (97.94%)
Acc (pred>=0.6) h3k9me3 = 235/239 (98.33%)
Acc (pred>=0.6) input = 1146/1179 (97.20%)
Acc (pred>=0.6): 2893/2949 (98.10%)
13c
(9619, 226)
Acc (pred>=0.0) 2532/3066 (82.58%)
Acc (pred>=0.6) h3k4me3 = 377/378 (99.74%)
Acc (pred>=0.6) h3k27ac = 345/357 (96.64%)
Acc (pred>=0.6) h3k4me1 = 253/255 (99.22%)
Acc (pred>=0.6) h3k36me3 = 229/239 (95.82%)
Acc (pred>=0.6) h3k27me3 = 281/290 (96.90%)
Acc (pred>=0.6) h3k9me3 = 238/240 (99.17%)
Acc (pred>=0.6) input = 667/1037 (64.32%)
Acc (pred>=0.6): 2390/2796 (85.48%)

non-core 7c preds¶

We're using manually labeled assay categories for non-core assays.
We labeled each assay type that has at least 3 files, as well as their related family members.

We're not including CTCF in this analysis.

In [174]:
# Restrict the 7c-model predictions to files whose labeled assay falls
# outside the core-7 set ("non-core" or "ctcf").
non_core_preds = pred_dfs_dict["7c"].copy(deep=True)
outside_core7 = non_core_preds[ASSAY].isin(["non-core", "ctcf"])
non_core_preds = non_core_preds[outside_core7]

print(f"Non-core datasets: {non_core_preds.shape[0]}")

# Fraction of non-core files predicted with high confidence (>= 0.6).
N_total = non_core_preds.shape[0]
N_high_conf = (non_core_preds["Max pred"] >= 0.6).sum()
print(
    f"High confidence non-core predictions: {N_high_conf / N_total:.2%} ({N_high_conf}/{N_total})"
)
Non-core datasets: 4061
High confidence non-core predictions: 72.59% (2948/4061)
In [175]:
# Output folder (same as the mislabel reports above).
# Fix 1: the original `output_dir = data_dir = (...)` also rebound `data_dir`,
# clobbering the training-results data_dir defined earlier.
# Fix 2: the output filename interpolated a stale `name` left over from the
# earlier pred_dfs_dict loop (always "13c"); these predictions come from the
# 7c model, so the label is now explicit.
output_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
if not output_dir.exists():
    output_dir.mkdir(parents=True)

df = non_core_preds
for min_pred in [0, 0.6, 0.8]:
    # Count (predicted class, assay) pairs above the score threshold.
    df_filtered = df[df["Max pred"] >= min_pred]
    groupby = (
        df_filtered.groupby(["Predicted class", "assay"])
        .size()
        .reset_index(name="Count")
        .sort_values(by=["Predicted class", "Count"], ascending=[True, False])
        .set_index(["Predicted class", "assay"])["Count"]
    )
    groupby.to_csv(
        output_dir / f"encode_non-core_7c_predictions_minPred{min_pred}.csv"
    )
In [176]:
# Load the manually curated assay-category labels for ENCODE non-core assays.
encode_metadata_dir = base_data_dir / "metadata/encode"
categories_filename = "non-core_encode_assay_category_2024-08-29.csv"
non_core_categories_path = encode_metadata_dir / categories_filename
if not non_core_categories_path.exists():
    raise FileNotFoundError(f"File {non_core_categories_path} does not exist.")

# The csv has no header row; name the columns explicitly.
category_columns = ["assay", "assay_category", "note"]
non_core_categories_df = pd.read_csv(
    non_core_categories_path, sep=",", names=category_columns
)
print(non_core_categories_df.shape)
(1171, 3)
In [177]:
# Attach the curated assay categories to the non-core predictions.
category_cols = ["assay", "assay_category"]
df_w_cats = non_core_preds.merge(
    non_core_categories_df[category_cols], on="assay", how="left"
)
print(df_w_cats.shape)

# NOTE(review): this fills NaN in *every* column with "not_looked", not only
# assay_category — presumably only the category column matters downstream;
# confirm before narrowing the fill.
df_w_cats = df_w_cats.fillna("not_looked")
display(df_w_cats["assay_category"].value_counts(dropna=False))
(4061, 221)
assay_category
trx_reg        1978
not_looked     1336
insulator       526
polycomb         94
other/mixed      72
heterochrom      42
splicing         13
Name: count, dtype: int64
In [178]:
def create_non_core_preds_df(df: pd.DataFrame, min_pred: float = 0.6):
    """Build a per-assay table of predicted-class counts.

    Rows are assays, columns are predicted classes (number of samples with
    'Max pred' >= min_pred), plus an 'assay_category' column mapped from the
    input frame.
    """
    # Remember each assay's curated category before grouping.
    category_by_assay = dict(zip(df["assay"], df["assay_category"]))

    counts_per_assay = {}
    for assay_name, assay_group in df.groupby("assay"):
        confident = assay_group[assay_group["Max pred"] >= min_pred]

        class_counts = (
            confident.groupby(["Predicted class"])
            .size()
            .reset_index(name="Count")  # type: ignore
            .sort_values(by=["Count"], ascending=False)
        )

        counts_per_assay[assay_name] = dict(
            zip(class_counts["Predicted class"], class_counts["Count"])
        )

    # Assays become rows; absent predicted classes become zero counts.
    result_df = pd.DataFrame(counts_per_assay).fillna(0).astype(int).T
    result_df["assay_category"] = result_df.index.map(category_by_assay)
    return result_df
In [179]:
# Export per-assay predicted-class counts at several score thresholds.
# Fix: the original filename interpolated a stale `name` variable left over
# from the earlier pred_dfs_dict loop (always "13c"); these predictions come
# from the 7c model, so the label is now explicit.
for min_pred in [0, 0.6, 0.8]:
    predicted_classes_df = create_non_core_preds_df(df_w_cats, min_pred=min_pred)
    predicted_classes_df.to_csv(
        output_dir
        / f"encode_non-core_7c_predictions_per_assay_minPred{min_pred:.2f}.csv"
    )
In [180]:
# Figure output folder for non-core assay prediction plots (must pre-exist).
section_fig_dir = base_fig_dir / "encode_predictions" / "assay_epiclass" / "non-core"
if not section_fig_dir.exists():
    raise FileNotFoundError(f"Directory {section_fig_dir} does not exist.")
In [181]:
def create_structured_dataframe(df_w_cats):
    """Create a structured dataframe with the percentage of predictions for each assay category."""
    # Create an empty list to store our data
    data = []

    # Iterate through the grouped data
    for predicted_class, group in df_w_cats.groupby("Predicted class"):
        for min_pred in list(np.arange(0, 1, 0.05)) + [0.99]:
            df_filtered = group[group["Max pred"] >= min_pred]
            counts = df_filtered["assay_category"].value_counts(dropna=False)
            total = counts.sum()

            # Calculate percentages
            percentages = (counts / total * 100).round(2)

            # Add data for each assay category
            for assay_category, percentage in percentages.items():
                data.append(
                    {
                        "Predicted class": predicted_class,
                        "Min pred": min_pred,
                        "assay_category": assay_category,
                        "Percentage": percentage,
                        "Count": counts[assay_category],
                        "Total samples": total,
                    }
                )

    # Create the dataframe
    df_structured = pd.DataFrame(data)

    # Set the multi-index
    df_structured = df_structured.set_index(
        ["Predicted class", "Min pred", "assay_category"]
    )

    return df_structured
In [182]:
# Percentage composition per (Predicted class, Min pred, assay_category).
assay_category_df = create_structured_dataframe(df_w_cats)
assay_category_df.to_csv(output_dir / "encode_non-core_7c_predictions_assay_category.csv")

X = assay_category, stack = assay_epiclass¶

In [183]:
# Keep only assays with a curated category label for downstream plots.
df = df_w_cats[df_w_cats["assay_category"] != "not_looked"]
print(df.shape)
print(f"{df['assay'].nunique()} unique assays.")
# NOTE(review): the displayed value_counts uses the *unfiltered* df_w_cats
# (still includes "not_looked" assays) — confirm this is intentional.
df_w_cats["assay"].value_counts(dropna=False)
(2725, 221)
238 unique assays.
Out[183]:
assay
ctcf        468
h3k9ac      117
polr2a      115
h3k4me2      77
h4k20me1     57
           ...
prdm6         1
glis2         1
znf546        1
mterf2        1
bnc2          1
Name: count, Length: 1170, dtype: int64
In [184]:
# Fixed display order of the seven core assay classes, mapped to their rank
# so predicted-class rows can be sorted consistently in plots.
assay_epiclass_order = {
    assay: rank
    for rank, assay in enumerate(
        [
            "h3k27ac",
            "h3k4me3",
            "h3k4me1",
            "h3k9me3",
            "h3k27me3",
            "h3k36me3",
            "input",
        ]
    )
}
In [185]:
# Stacked-bar figure: composition of core-7 predictions per assay category.
fig_dir = section_fig_dir / "stacked_bar_X_assay_category"
fig_dir.mkdir(parents=False, exist_ok=True)

# Fixed x-axis order for the curated categories.
assay_categories_order = [
    "trx_reg",
    "heterochrom",
    "polycomb",
    "splicing",
    "insulator",
    "other/mixed",
]

# for min_pred in [0, 0.6, 0.8]:
for min_pred in [0.6]:
    # Keep only predictions above the score threshold.
    sub_df = df[df["Max pred"] >= min_pred]
    print(f"Min pred: {min_pred}, Samples: {sub_df.shape[0]} ({sub_df.shape[0]/df.shape[0]:.2%})")
    # Count predicted classes within each category.
    groupby = (
        sub_df.groupby(["assay_category", "Predicted class"])
        .size()
        .reset_index(name="Count")
        .sort_values(by=["assay_category", "Count"], ascending=[True, False])
    )
    # Convert counts to within-category percentages (stack heights).
    groupby["Percentage"] = groupby.groupby("assay_category")["Count"].transform(
        lambda x: (x / x.sum()) * 100
    )

    # Add order for plotting (stack segments follow assay_epiclass_order)
    groupby["assay_order"] = groupby["Predicted class"].map(assay_epiclass_order)
    groupby = groupby.sort_values(
        by=["assay_category", "assay_order"], ascending=[False, True]
    )

    print(groupby, "\n")
    # Main plot
    fig = px.bar(
        groupby,
        x="assay_category",
        y="Percentage",
        color="Predicted class",
        barmode="stack",
        category_orders={"assay_category": assay_categories_order},
        color_discrete_map=assay_colors,
        title=f"core7 predictions for non-core assays, predScore >= {min_pred:.2f}",
        labels={"Percentage": "Fraction (%)", "assay_category": "Assay Category"},
    )

    # Modify x-axis labels to include per-category sample counts (N=...)
    total_counts = groupby.groupby("assay_category")["Count"].sum()

    ticktext = [
        f"{assay_category} (N={total_counts[assay_category]})"
        for assay_category in assay_categories_order
    ]
    fig.update_xaxes(tickvals=assay_categories_order, ticktext=ticktext)

    # Save and display
    figname = f"histogram_encode_non-core_assay_epiclass_minPred{min_pred:.2f}"
    fig.write_html(fig_dir / f"{figname}.html")
    fig.write_image(fig_dir / f"{figname}.png")
    fig.write_image(fig_dir / f"{figname}.svg")
    fig.show()
Min pred: 0.6, Samples: 2055 (75.41%)
   assay_category Predicted class  Count  Percentage  assay_order
19        trx_reg         h3k27ac    628   39.797212            0
23        trx_reg         h3k4me3    538   34.093790            1
22        trx_reg         h3k4me1    195   12.357414            2
24        trx_reg         h3k9me3      7    0.443599            3
20        trx_reg        h3k27me3      6    0.380228            4
21        trx_reg        h3k36me3      4    0.253485            5
25        trx_reg           input    200   12.674271            6
16       splicing         h3k27ac      1   10.000000            0
17       splicing        h3k36me3      7   70.000000            5
18       splicing           input      2   20.000000            6
14       polycomb         h3k4me3      7    8.536585            1
13       polycomb        h3k27me3     74   90.243902            4
15       polycomb           input      1    1.219512            6
8     other/mixed         h3k27ac     12   27.906977            0
10    other/mixed         h3k4me3     12   27.906977            1
9     other/mixed         h3k4me1      6   13.953488            2
11    other/mixed         h3k9me3      2    4.651163            3
12    other/mixed           input     11   25.581395            6
4       insulator         h3k27ac      3    0.983607            0
6       insulator         h3k4me3     46   15.081967            1
5       insulator        h3k27me3      2    0.655738            4
7       insulator           input    254   83.278689            6
0     heterochrom         h3k27ac      1    2.702703            0
1     heterochrom         h3k4me3      2    5.405405            1
2     heterochrom         h3k9me3     24   64.864865            3
3     heterochrom           input     10   27.027027            6

Assay category evolution with min_predScore¶

In [186]:
def create_assay_category_graphs(df, output_dir: Path | None = None):
    """Graph assay category distribution for each predicted class.

    Args:
        df: Frame indexed by (Predicted class, Min pred, assay_category) with
            'Percentage' and 'Total samples' columns (the output of
            create_structured_dataframe).
        output_dir: If given, figures are written there as png/svg/html.

    Fix: the figure write calls used the broken literal "(unknown)" instead
    of the computed `filename`.
    """
    # Get unique predicted classes and categories across the whole frame
    predicted_classes = df.index.get_level_values("Predicted class").unique()
    assay_categories = df.index.get_level_values("assay_category").unique()

    # Stable color per category (sorted so colors don't depend on row order)
    graph_colors = {
        cat: px.colors.qualitative.Safe[i]
        for i, cat in enumerate(sorted(assay_categories))
    }

    # Create a figure for each predicted class
    for predicted_class in predicted_classes:
        df_class = df.loc[predicted_class]

        # Categories actually present for this predicted class
        assay_categories = df_class.index.get_level_values("assay_category").unique()

        # Baseline sample count at threshold 0 (for the conserved-% trace)
        total_samples_at_zero = df_class.xs(0, level="Min pred")["Total samples"].iloc[0]

        fig = go.Figure()

        # One line per assay category: composition % vs min_pred threshold
        for assay_category in assay_categories:
            df_assay = df_class.xs(assay_category, level="assay_category")

            fig.add_trace(
                go.Scatter(
                    x=df_assay.index,
                    y=df_assay["Percentage"],
                    mode="lines+markers",
                    name=assay_category,
                    marker=dict(color=graph_colors[assay_category]),
                )
            )

        # Fraction of samples surviving each threshold (dashed black line)
        conserved_percentages = (
            df_class.groupby("Min pred")["Total samples"].first()
            / total_samples_at_zero
            * 100
        )
        fig.add_trace(
            go.Scatter(
                x=conserved_percentages.index,
                y=conserved_percentages.values,
                mode="lines+markers",
                name="Samples Conserved",
                line=dict(dash="dash", color="black"),
            )
        )

        fig.update_layout(
            title=f"Composition for Predicted Class: {predicted_class}",
            xaxis_title="Min pred",
            yaxis_title="Percentage Composition",
            legend_title="Assay Category",
            hovermode="x unified",
        )

        fig.update_xaxes(range=[-0.01, 1.01])
        fig.update_yaxes(range=[0, 100])

        # Save (png/svg/html) if an output directory was provided
        if output_dir:
            filename = f"encode_non-core_7c_predictions_assay_category_{predicted_class}"
            fig.write_image(output_dir / f"{filename}.png")
            fig.write_image(output_dir / f"{filename}.svg")
            fig.write_html(output_dir / f"{filename}.html")
        fig.show()
In [187]:
# Assuming df_structured is your dataframe from the previous step
fig_dir = (
    base_fig_dir
    / "encode_predictions"
    / "assay_epiclass"
    / "non-core"
    / "line_graphs_over_min_pred"
)
fig_dir.mkdir(parents=False, exist_ok=True)
# Figures are only displayed here, not saved (the saving call is disabled).
# create_assay_category_graphs(df=assay_category_df, output_dir=fig_dir)
create_assay_category_graphs(df=assay_category_df)

OTHER - Sex, life stage, cancer¶

Combining all predictions to compute accuracy/F1 for each of the 5 classifiers, on core and non-core data respectively. (The assay task is messier: non-core files cannot be evaluated directly.)

In [188]:
# Create a coarser life-stage classification: embryo/fetal/newborn are all
# merged into "prenatal"; adult and child are kept as-is.
merge_life_stage = {
    "adult": "adult",
    "embryo": "prenatal",
    "fetal": "prenatal",
    "newborn": "prenatal",
    "child": "child",
}
# Work on a fresh copy to avoid SettingWithCopyWarning.
all_preds_no_epiatlas = all_preds_no_epiatlas.copy(deep=True)

# Remap the metadata label plus the true/predicted class columns.
labels_to_remap = [
    LIFE_STAGE,
    f"True class ({LIFE_STAGE})",
    f"Predicted class ({LIFE_STAGE})",
]
for label in labels_to_remap:
    merged_label = label.replace(LIFE_STAGE, f"{LIFE_STAGE}_merged")
    all_preds_no_epiatlas[merged_label] = all_preds_no_epiatlas[label].map(
        merge_life_stage
    )

# The merged task reuses the original task's prediction scores.
all_preds_no_epiatlas[f"Max pred ({LIFE_STAGE}_merged)"] = all_preds_no_epiatlas[
    f"Max pred ({LIFE_STAGE})"
]

Accuracies per assay¶

Reformat data for easy plotting¶

In [189]:
# Folders where the per-assay metric tables/figures will be written.
folders_to_save = [
    base_fig_dir / "encode_predictions",
    base_data_dir / "training_results" / "predictions" / "encode",
    table_dir / "dfreeze_v2" / "predictions" / "metrics",
]
In [190]:
metrics_handler = MetricsPerAssay()
In [191]:
# Compute accuracy/F1 per assay for all non-EpiAtlas ENCODE predictions.
filename = "ENCODE_7tasks_acc_per_assay_NO_EpiAtlas"
dfs_dict: Dict[str, pd.DataFrame] = metrics_handler.compute_multiple_metric_formats(  # type: ignore
    preds=all_preds_no_epiatlas,
    folders_to_save=folders_to_save,
    general_filename=filename,
    verbose=False,
    return_df=True,
)

# For future use.
# Fix: the key was a broken f-string literal ("(unknown).tsv"); it must
# reference the filename defined above.
structured_df = dfs_dict[f"{filename}.tsv"]
In [192]:
# Keep only the 16 cell types
preds_no_epiatlas_16ct = all_preds_no_epiatlas[
    all_preds_no_epiatlas[CELL_TYPE].isin(EPIATLAS_16_CT)
]

filename = "ENCODE_7tasks_acc_per_assay_NO_EpiAtlas_16ct"
# Metric tables are written to folders_to_save; return value unused here.
metrics_handler.compute_multiple_metric_formats(
    preds=preds_no_epiatlas_16ct,
    folders_to_save=folders_to_save,
    general_filename=filename,
    verbose=False,
)
In [193]:
# No cell line: drop cell-line / other / unknown biomaterial types before
# computing metrics. (Fixed comment typo "No cell ine".)
df = all_preds_no_epiatlas.copy(deep=True)
# Column assignment instead of inplace fillna on a column view (the
# inplace form is a deprecated chained-assignment pattern in pandas).
df[BIOMAT] = df[BIOMAT].fillna("unknown")

df = df[~df[BIOMAT].isin(["cell line", "other", "unknown"])]

filename = "ENCODE_7tasks_acc_per_assay_NO_EpiAtlas_no_cell_line"
metrics_handler.compute_multiple_metric_formats(
    preds=df,
    folders_to_save=folders_to_save,
    general_filename=filename,
    verbose=False,
)

Separate min_pred graphing¶

In [194]:
def plot_encode_metrics_per_assay(
    df_acc_per_assay: pd.DataFrame, min_pred: float = 0, logdir: Path | None = None
) -> None:
    """Plot accuracy+F1 of each classification task, per assay.

    Args:
        df_acc_per_assay: Long-format metrics table with 'min_predScore',
            'task_name', 'acc', 'f1-score' and an assay column.
        min_pred: min_predScore row to plot (matched within a +-0.01 window
            to avoid float equality issues).
        logdir: If given, figures are written there as png/svg/html.

    Fix: the write calls used the broken literal "(unknown)" instead of the
    computed `filename`.
    """
    df = copy.deepcopy(df_acc_per_assay)

    # Selecting min_pred
    to_plot = df[
        (df["min_predScore"] > (min_pred - 0.01))
        & (df["min_predScore"] < (min_pred + 0.01))
    ]

    # sort tasks by avg_acc (descending) using the "avg-core" summary rows
    averages = to_plot[to_plot[ASSAY] == "avg-core"]
    avg_acc = list(zip(averages["acc"], averages["task_name"]))
    task_order = [
        task_name for _, task_name in sorted(avg_acc, key=lambda x: x[0], reverse=True)
    ]

    # Removing undesired assays
    to_plot = to_plot[to_plot[ASSAY].isin(CORE_ASSAYS + ["rna_seq"])]

    # Human-readable task names for the x-axis
    names = {
        f"{ASSAY}_7c": "assay_7c",
        f"{ASSAY}_11c": "assay_11c",
        LIFE_STAGE: "life stage",
        f"{LIFE_STAGE}_merged": "life stage (merged)",
        CELL_TYPE: "cell type",
        SEX: "sex",
        CANCER: "cancer",
        BIOMAT: "biomaterial_type",
    }

    # Plot each task: a gray box for the distribution plus one colored
    # point per assay
    for metric in ["acc", "f1-score"]:
        fig = go.Figure()
        for task_name in task_order:
            task_df = to_plot[to_plot["task_name"] == task_name]

            task_name = names[task_name]

            # F1 is skipped for the multi-class assay tasks
            if task_name in ["assay_7c", "assay_11c"] and metric == "f1-score":
                continue

            fig.add_trace(
                go.Box(
                    x=[task_name] * len(task_df),
                    y=task_df[metric],
                    name=metric,
                    boxpoints="outliers",
                    boxmean=True,
                    marker_color="gray",
                    showlegend=False,
                    hoverinfo="skip",
                )
            )

            fig.add_trace(
                go.Scatter(
                    x=[task_name] * len(task_df),
                    y=task_df[metric],
                    mode="markers",
                    name=task_name,
                    marker_color=[assay_colors[assay] for assay in task_df[ASSAY]],
                    hoverinfo="text",
                    hovertext=[
                        f"{assay}: {value:.3f}"
                        for assay, value in zip(task_df[ASSAY], task_df[metric])
                    ],
                    showlegend=False,
                )
            )

        y_axis_label = "F1-score" if metric == "f1-score" else "Accuracy"
        fig.update_layout(
            xaxis_title="Classification task",
            yaxis_title=y_axis_label,
            font=dict(size=18),
            width=800,
            height=600,
            title=f"ENCODE: Task {y_axis_label} per assay (min_predScore={min_pred:.2f})",
        )

        fig.update_yaxes(range=[0, 1.01])

        # Show/Write the plot
        if logdir:
            filename = f"encode_6tasks_metrics_per_assay-{metric}-{min_pred*100:.0f}"
            fig.write_image(logdir / f"{filename}.png")
            fig.write_image(logdir / f"{filename}.svg")
            fig.write_html(logdir / f"{filename}.html")

        fig.show()
In [195]:
# Output folder for the per-assay metrics figures.
logdir = base_fig_dir / "encode_predictions" / "metrics_per_assay"
if not logdir.exists():
    logdir.mkdir(parents=True)

# One figure set per prediction-score threshold.
for min_pred in [0, 0.6, 0.8, 0.9]:
    plot_encode_metrics_per_assay(structured_df, min_pred=min_pred, logdir=logdir)

Multiple min_predScore¶

In [196]:
def plot_all_acc_per_assay(
    graph_df: pd.DataFrame, minY, maxY, logdir: Path | None = None
):
    """Plot accuracy per assay, per min_predScore, per scatter_name/task_name,
    for core vs non-core assays.

    Args:
        graph_df: Long-format table with 'task_name', 'min_predScore' (as
            strings "0.0"/"0.6"/"0.9"), 'acc', 'nb_samples' and an assay
            column. NOTE: mutated in place (a 'scatter_name' column is added).
        minY, maxY: y-axis range; NOTE these arguments are overwritten inside
            the loop with per-graph-type values, so the passed values are
            effectively unused.
        logdir: If given, figures are written there as html/png/svg.
    """
    min_predScore_color_map = {"0.0": "blue", "0.6": "orange", "0.9": "red"}

    # Shorter task labels: strip the "harmonized_" prefix
    graph_df["scatter_name"] = graph_df["task_name"].replace(
        "harmonized_", "", regex=True
    )

    graph_df = graph_df.sort_values(by=[ASSAY, "min_predScore", "scatter_name"])

    # Two figures: core assays (+rna_seq) and everything else
    for graph_type in ["core", "non-core"]:
        df = graph_df.copy(deep=True)
        if graph_type == "core":
            df = df[df[ASSAY].isin(CORE_ASSAYS + ["rna_seq"])]
            minY = 0.55
            maxY = 1.001
        elif graph_type == "non-core":
            df = df[~df[ASSAY].isin(CORE_ASSAYS)]
            minY = 0
            maxY = 1
        else:
            raise ValueError(f"Invalid graph type: {graph_type}")

        unique_assays = list(df[ASSAY].unique())

        # Calculate average over assays, per (threshold, task)
        avg_df = df.groupby(["min_predScore", "scatter_name"])["acc"].mean().reset_index()
        avg_df[ASSAY] = "Average"

        # traces_per_assay = df["scatter_name"].nunique()

        fig = go.Figure()

        # One average trace + one per-assay trace per threshold
        for min_pred in ["0.0", "0.6", "0.9"]:
            df_subset = df[df["min_predScore"] == min_pred]
            avg_subset = avg_df[avg_df["min_predScore"] == min_pred]

            # Add average over assay trace (star markers)
            fig.add_trace(
                go.Scatter(
                    x=["Average - " + name for name in avg_subset["scatter_name"]],
                    y=avg_subset["acc"],
                    mode="markers",
                    name=f"Avg Min Pred Score: {min_pred}",
                    marker=dict(
                        color=min_predScore_color_map[min_pred],
                        size=9,
                        symbol="star",
                    ),
                    hoverinfo="y+x",
                    showlegend=False,
                )
            )

            # Add individual assay traces, with sample counts in the hover
            hovertext = list(
                zip(
                    df_subset[ASSAY],
                    df_subset["nb_samples"].apply(lambda x: f"Samples: {x}"),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=df_subset[ASSAY] + " - " + df_subset["scatter_name"],
                    y=df_subset["acc"],
                    mode="markers",
                    name=f"Min Pred Score: {min_pred}",
                    marker=dict(
                        color=min_predScore_color_map[min_pred],
                        size=9,
                    ),
                    text=hovertext,
                    hoverinfo="text+y+x",
                )
            )

        # Modify x-axis tick labels
        # NOTE(review): df_subset here is the last loop iteration's subset
        # ("0.9"); assumes every threshold has the same scatter_names.
        ticktext = []
        tick_group = list(df_subset["scatter_name"].unique())
        for i, tick in enumerate(tick_group):
            tick_group[i] = f"<b>{tick}</b>"

        # Repeat the task labels once per assay group (+1 for "Average")
        for i in range(len(unique_assays) + 1):
            ticktext.extend(tick_group)

        fig.update_xaxes(
            tickmode="array", ticktext=ticktext, tickvals=list(range(len(ticktext)))
        )

        # Add assay labels on top + vertical lines between assay groups
        fig.add_annotation(
            x=len(tick_group) / 2 - 0.5,
            y=1.05,
            yref="paper",
            text="Average",
            showarrow=False,
            font=dict(size=14),
        )

        fig.add_vline(
            x=len(tick_group) - 0.5, line_width=2, line_dash="solid", line_color="black"
        )
        fig.add_hline(y=1, line_width=1, line_color="black")

        for i, label in enumerate(unique_assays):
            fig.add_annotation(
                x=(i + 1) * len(tick_group) + len(tick_group) / 2 - 0.5,
                y=1.05,
                yref="paper",
                text=label,
                showarrow=False,
                font=dict(size=14),
            )
            fig.add_vline(
                x=(i + 1) * len(tick_group) - 0.5,
                line_width=1,
                line_dash="dash",
                line_color="black",
            )

        # titles + yaxis range
        fig.update_layout(
            title="ENCODE data - Label match per Assay and Task",
            xaxis_title="Assay - Task",
            yaxis_title="Match %",
            xaxis_tickangle=-45,
            showlegend=True,
            height=600,
            width=1200,
            yaxis=dict(tickformat=".2%", range=[minY, maxY]),
        )

        # Show/Write the plot
        print(f"Graphing {graph_type}")
        if logdir:
            figname = f"encode_{graph_type}_acc_per_assay_minY{minY:.2f}"
            fig.write_html(logdir / f"{figname}.html")
            fig.write_image(logdir / f"{figname}.png")
            fig.write_image(logdir / f"{figname}.svg")
        fig.show()
In [197]:
# this_fig_dir = base_fig_dir / "encode_predictions" / "acc_per_assay"
# if not this_fig_dir.exists():
#     raise FileNotFoundError(f"Folder {this_fig_dir} does not exist")

Confusion matrices¶

In [198]:
def graph_confusion_matrix(all_preds: pd.DataFrame, output_dir: Path):
    """Graph confusion matrices for each classification task, for both core and non-core assays.

    For every (assay group, task, confidence threshold) combination, writes a
    confusion matrix in all ConfusionMatrixWriter formats under
    ``output_dir/<task_name>/``.

    Args:
        all_preds: Prediction table containing, per task ``name``, the columns
            ``"True class (<name>)"``, ``"Predicted class (<name>)"`` and
            ``"Max pred (<name>)"``, plus an ``ASSAY`` column.
        output_dir: Root folder; one sub-folder per task is created.
    """
    # DataFrame.copy(deep=True) is the idiomatic (and cheaper) way to get an
    # independent copy — copy.deepcopy is unnecessary here.
    df = all_preds.copy(deep=True)

    # Collapse sub-assay labels into their parent label before comparing
    # predictions to ground truth.
    assay_merger = {
        "mrna_seq": "rna_seq",
        "wgbs-pbat": "wgbs",
        "wgbs-standard": "wgbs",
    }
    for col in df.columns:
        if ASSAY in col:
            # Reassign instead of `inplace=True` on a column selection:
            # chained inplace replace is deprecated and is a silent no-op
            # under pandas copy-on-write.
            df[col] = df[col].replace(assay_merger)

    for graph_type in ["core", "non-core"]:
        print(f"Graphing {graph_type}")
        if graph_type == "core":
            sub_df = df[df[ASSAY].isin(ASSAY_ORDER)]
        else:  # "non-core" — loop iterates over exactly these two literals
            sub_df = df[df[ASSAY].isin(["ctcf", "non-core"])]

        for name in [
            f"{ASSAY}_7c",
            f"{ASSAY}_11c",
            CELL_TYPE,
            SEX,
            LIFE_STAGE,
            f"{LIFE_STAGE}_merged",
            CANCER,
        ]:
            # Skip task/group combinations that are not meaningful BEFORE
            # creating the output folder, so no empty directories appear.
            if name == CELL_TYPE and graph_type == "core":
                continue
            if name in [f"{ASSAY}_7c", f"{ASSAY}_11c"] and graph_type == "non-core":
                continue

            logdir = output_dir / name
            logdir.mkdir(parents=True, exist_ok=True)

            y_true_col = f"True class ({name})"
            y_pred_col = f"Predicted class ({name})"
            max_pred_label = f"Max pred ({name})"

            # Missing labels become "unknown", then rows lacking either a
            # ground truth or a prediction for this task are dropped.
            task_df = sub_df.copy(deep=True)
            task_df = task_df.fillna("unknown")
            for col in [y_true_col, y_pred_col]:
                task_df = task_df[task_df[col] != "unknown"]

            if name == CELL_TYPE:
                # Restrict to the 16 EpiAtlas cell types the model was trained on.
                task_df = task_df[task_df[CELL_TYPE].isin(EPIATLAS_16_CT)]

            print(name, task_df.shape)
            for threshold in [0, 0.6, 0.8, 0.9]:
                filtered_df = task_df[task_df[max_pred_label] >= threshold]

                if filtered_df.shape[0] == 0:
                    print(f"No data for {name} with threshold {threshold}. Skipping.")
                    continue

                true, pred = filtered_df[y_true_col], filtered_df[y_pred_col]
                if name == CELL_TYPE:
                    labels = EPIATLAS_16_CT
                else:
                    # Use the union of observed labels so the matrix is square.
                    labels = sorted(set(true.unique()) | set(pred.unique()))

                cm = confusion_matrix(true, pred, labels=labels)

                writer = ConfusionMatrixWriter(labels=labels, confusion_matrix=cm)
                writer.to_all_formats(
                    logdir=logdir,
                    name=f"{name}-{graph_type}-confusion_matrix-{threshold*100:.0f}",
                )
                # Close figures created by the writer to avoid memory blowup.
                plt.close("all")
In [199]:
# Destination folder for the ENCODE confusion-matrix figures.
cm_logdir = base_fig_dir.joinpath("encode_predictions", "confusion_matrices")
graph_confusion_matrix(all_preds_no_epiatlas, cm_logdir)
Graphing core
assay_epiclass_7c (3066, 280)
assay_epiclass_11c (4716, 280)
harmonized_donor_sex (4157, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

harmonized_donor_life_stage (4036, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

harmonized_donor_life_stage_merged (3208, 280)
harmonized_sample_cancer_high (1986, 280)
Graphing non-core
harmonized_sample_ontology_intermediate (262, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

harmonized_donor_sex (3990, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

harmonized_donor_life_stage (3895, 280)
/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

/home/local/USHERBROOKE/rabj2301/Projects/sources/epi_ml/src/python/epi_ml/core/confusion_matrix.py:135: RuntimeWarning:

invalid value encountered in true_divide

harmonized_donor_life_stage_merged (3098, 280)
harmonized_sample_cancer_high (4061, 280)

track type¶

In [200]:
# Location of the 100kb track-type predictions for ENCODE files.
track_type_pred_path = base_data_dir.joinpath(
    "training_results",
    "predictions",
    "encode",
    "track_type",
    "split0_test_prediction_100kb_all_none_all.list.csv",
)
track_type_pred_df = pd.read_csv(track_type_pred_path)
In [201]:
# Prediction-probability columns start at index 3; their row-wise max is
# the model's confidence in the winning track-type class.
pred_vector_cols = track_type_pred_df.columns[3:].tolist()
track_type_pred_df["Max_pred_track_type"] = track_type_pred_df[pred_vector_cols].max(
    axis=1
)
In [202]:
# Attach the full metadata to each prediction row via the ENCODE file accession.
# The unnamed first CSV column ("Unnamed: 0") holds the accession.
track_type_df = track_type_pred_df.merge(
    complete_metadata_df,
    how="inner",
    left_on="Unnamed: 0",
    right_on="FILE_accession",
)

print(track_type_df.shape, encode_df.shape, track_type_pred_df.shape)
(9601, 219) (252, 5) (9619, 13)
In [203]:
# Write count and relative pivot tables for several confidence thresholds
# into a single CSV file, each table preceded by a title line.
# (The original comment mentioned excel sheets, but the output is one CSV.)
output = track_type_pred_path.parent / "track_type_predictions_pivot.csv"

verbose = False

# Opening in "w" mode truncates any previous file, so the former
# unlink + append-mode dance is unnecessary.
with open(output, "w", encoding="utf8") as csv_stream:
    for min_pred in [0, 0.6, 0.8]:
        # Keep only predictions at or above the confidence threshold.
        df = track_type_df[track_type_df["Max_pred_track_type"] >= min_pred]
        pivot = df.pivot_table(
            index=ASSAY,
            columns="Predicted class",
            values="Max_pred_track_type",
            aggfunc="count",
            fill_value=0,
            margins=True,
        ).astype(int)
        # Row-wise percentages relative to the "All" margin column.
        relative_pivot = pivot.div(pivot["All"], axis=0) * 100

        csv_stream.write(f"Count Pivot - Min pred: {min_pred}\n")
        pivot.to_csv(csv_stream)
        csv_stream.write("\n")

        csv_stream.write(f"Relative Pivot - Min pred: {min_pred}\n")
        relative_pivot.to_csv(csv_stream)
        csv_stream.write("\n")

        if verbose:
            display(pivot)
            # pylint: disable=consider-using-f-string
            with pd.option_context("display.float_format", "{:.2f}".format):
                display(relative_pivot)

RNA-Seq Assay¶

In [204]:
# Keep only the RNA-based assays (rna_seq and mrna_seq) for a closer look.
rna_mask = all_preds_no_epiatlas[ASSAY].str.contains("rna")
rna_df = all_preds_no_epiatlas.loc[rna_mask].copy(deep=True)
In [205]:
print("RNA-Seq assay accuracy, if mrna_seq != rna_seq\n")
assay_label = f"{ASSAY}_11c"
for min_pred in [0, 0.6, 0.8]:
    df = rna_df[rna_df[f"Max pred ({assay_label})"] >= min_pred]
    if df.empty:
        # Guard against ZeroDivisionError when no sample passes the threshold.
        print(f"Min pred: {min_pred}: no samples above threshold.\n")
        continue

    # Accuracy = fraction of rows where prediction matches ground truth;
    # the mean of a boolean Series is that fraction directly.
    acc = (
        df[f"True class ({assay_label})"] == df[f"Predicted class ({assay_label})"]
    ).mean()
    print(
        f"Min pred: {min_pred}, Accuracy: {acc:.4f}. Samples: {len(df)}/{rna_df.shape[0]}\n"
    )
    # Count predicted classes per true assay, largest counts first.
    groupby = (
        df.groupby([ASSAY, f"Predicted class ({assay_label})"])
        .size()
        .reset_index()
        .rename(columns={0: "Count"})
        .sort_values([ASSAY, "Count"], ascending=False)
    )
    print(groupby, "\n")
RNA-Seq assay accuracy, if mrna_seq != rna_seq

Min pred: 0, Accuracy: 0.9265. Samples: 1552/1552

  assay_epiclass Predicted class (assay_epiclass_11c)  Count
3        rna_seq                              rna_seq    885
2        rna_seq                             mrna_seq     63
0       mrna_seq                             mrna_seq    553
1       mrna_seq                              rna_seq     51

Min pred: 0.6, Accuracy: 0.9427. Samples: 1483/1552

  assay_epiclass Predicted class (assay_epiclass_11c)  Count
3        rna_seq                              rna_seq    877
2        rna_seq                             mrna_seq     47
0       mrna_seq                             mrna_seq    521
1       mrna_seq                              rna_seq     38

Min pred: 0.8, Accuracy: 0.9514. Samples: 1357/1552

  assay_epiclass Predicted class (assay_epiclass_11c)  Count
3        rna_seq                              rna_seq    835
2        rna_seq                             mrna_seq     37
0       mrna_seq                             mrna_seq    456
1       mrna_seq                              rna_seq     29